In [64]:
library(tidyverse)
library(nycflights13)

In [ ]:
flights %>% head() 
airlines %>% head() # 항공사
airports %>% head() # 공항
planes %>% head() # 여객기
weather %>% head() # 공항의 매 시각 날씨 정보
In [65]:
flights_latlon <- flights %>%
  inner_join(select(airports, origin = faa, origin_lat = lat, origin_lon = lon),
    by = "origin"
  ) %>%
  inner_join(select(airports, dest = faa, dest_lat = lat, dest_lon = lon),
    by = "dest"
  )
flights_latlon %>%
  slice(1:100) %>%
  ggplot(aes(
    x = origin_lon, xend = dest_lon,
    y = origin_lat, yend = dest_lat
  )) +
  borders("state") +
  geom_segment(arrow = arrow(length = unit(0.1, "cm"))) +
  coord_quickmap() +
  labs(y = "Latitude", x = "Longitude")

key¶

  • 기본키 (primary key) : 자신의 테이블에서 관측값을 고유하게 식별
  • 외래키 (foreign key) : 다른 테이블의 관측값을 고유하게 식별, 다른 테이블의 열과 매칭
  • 대체키 (surrogate key)
  • 기본키와 이와 대응되는 다른 테이블의 외래키는 관계를 형성 [일대다 (one-to-many)]
In [66]:
planes %>% head()
A tibble: 6 × 9
tailnumyeartypemanufacturermodelenginesseatsspeedengine
<chr><int><chr><chr><chr><int><int><int><chr>
N101562004Fixed wing multi engineEMBRAER EMB-145XR2 55NATurbo-fan
N102UW1998Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
N103US1999Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
N104UW1999Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
N105752002Fixed wing multi engineEMBRAER EMB-145LR2 55NATurbo-fan
N105UW1999Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
In [67]:
planes %>% count(tailnum) %>% filter(n > 1)
A tibble: 0 × 2
tailnumn
<chr><int>
In [68]:
weather %>% count(year, month, day, hour, origin) %>% filter(n > 1)
A tibble: 3 × 6
yearmonthdayhouroriginn
<int><int><int><int><chr><int>
20131131EWR2
20131131JFK2
20131131LGA2
In [69]:
flights %>% count(year, month, day, flight) %>% filter(n > 1) %>% head()
A tibble: 6 × 5
yearmonthdayflightn
<int><int><int><int><int>
201311 12
201311 32
201311 42
201311113
201311152
201311212
In [70]:
flights %>% count(year, month, day, tailnum) %>% filter(n > 1) %>% head()
A tibble: 6 × 5
yearmonthdaytailnumn
<int><int><int><chr><int>
201311N0EGMQ2
201311N111892
201311N115362
201311N115443
201311N115512
201311N125402
In [71]:
# 10.3.1
# 1 
flights %>%
  arrange(year, month, day, sched_dep_time, carrier, flight) %>%
  mutate(flight_id = row_number()) %>%
  head()
A tibble: 6 × 20
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hourflight_id
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm><int>
201311517515 2 830 819 11UA1545N14228EWRIAH22714005152013-01-01 05:00:001
201311533529 4 850 830 20UA1714N24211LGAIAH22714165292013-01-01 05:00:002
201311542540 2 923 850 33AA1141N619AAJFKMIA16010895402013-01-01 05:00:003
201311544545-110041022-18B6 725N804JBJFKBQN18315765452013-01-01 05:00:004
201311554558-4 740 728 12UA1696N39463EWRORD150 7195582013-01-01 05:00:005
201311559559 0 702 706 -4B61806N708JBJFKBOS 44 1875592013-01-01 05:00:006
In [9]:
# 2 
library(Lahman)
Batting %>% count(playerID, yearID, stint) %>% filter(n > 1)
A data.frame: 0 × 4
playerIDyearIDstintn
<chr><int><int><int>
In [10]:
library(babynames)
babynames %>% count(year, sex, name) %>% filter(n > 1) %>% nrow()
0
In [11]:
nasaweather::atmos %>%
  count(lat, long, year, month) %>%
  filter(n > 1) %>% head()
A tibble: 0 × 5
latlongyearmonthn
<dbl><dbl><int><int><int>
In [12]:
fueleconomy::vehicles %>%
  count(id) %>%
  filter(n > 1) %>% head()
A tibble: 0 × 2
idn
<dbl><int>
In [13]:
diamonds <- mutate(ggplot2::diamonds, id = row_number())
head(diamonds)
A tibble: 6 × 11
caratcutcolorclaritydepthtablepricexyzid
<dbl><ord><ord><ord><dbl><dbl><int><dbl><dbl><dbl><int>
0.23Ideal ESI2 61.5553263.953.982.431
0.21Premium ESI1 59.8613263.893.842.312
0.23Good EVS1 56.9653274.054.072.313
0.29Premium IVS2 62.4583344.204.232.634
0.31Good JSI2 63.3583354.344.352.755
0.24Very GoodJVVS262.8573363.943.962.486
In [14]:
# 3 
head(Master) ; head(Batting) ; head(Salaries)
A data.frame: 6 × 26
playerIDbirthYearbirthMonthbirthDaybirthCountrybirthStatebirthCitydeathYeardeathMonthdeathDay...weightheightbatsthrowsdebutfinalGameretroIDbbrefIDdeathDatebirthDate
<chr><int><int><int><chr><chr><chr><int><int><int>...<int><int><fct><fct><chr><chr><chr><chr><date><date>
1aardsda0119811227USA CO Denver NANANA...21575RR2004-04-062015-08-23aardd001aardsda01NA1981-12-27
2aaronha011934 2 5USA AL Mobile 2021 122...18072RR1954-04-131976-10-03aaroh101aaronha012021-01-221934-02-05
3aaronto011939 8 5USA AL Mobile 1984 816...19075RR1962-04-101971-09-26aarot101aaronto011984-08-161939-08-05
4aasedo01 1954 9 8USA CA Orange NANANA...19075RR1977-07-261990-10-03aased001aasedo01 NA1954-09-08
5abadan01 1972 825USA FL Palm Beach NANANA...18473LL2001-09-102006-04-13abada001abadan01 NA1972-08-25
6abadfe01 19851217D.R.La RomanaLa Romana NANANA...23574LL2010-07-282019-09-28abadf001abadfe01 NA1985-12-17
A data.frame: 6 × 22
playerIDyearIDstintteamIDlgIDGABRHX2B...RBISBCSBBSOIBBHBPSHSFGIDP
<chr><int><int><fct><fct><int><int><int><int><int>...<int><int><int><int><int><int><int><int><int><int>
1abercda0118711TRONA 1 4 0 0 0... 00000NANANANA0
2addybo01 18711RC1NA251183032 6...138140NANANANA0
3allisar0118711CL1NA291372840 4...193125NANANANA1
4allisdo0118711WS3NA27133284410...271102NANANANA0
5ansonca0118711RC1NA25120293911...166221NANANANA0
6armstbo0118711FW1NA12 49 911 2... 50101NANANANA0
A data.frame: 6 × 5
yearIDteamIDlgIDplayerIDsalary
<int><fct><fct><chr><int>
11985ATLNLbarkele01870000
21985ATLNLbedrost01550000
31985ATLNLbenedbr01545000
41985ATLNLcampri01 633333
51985ATLNLceronri01625000
61985ATLNLchambch01800000

Master¶

  • Primary key : playerID, Batting

Batting¶

  • Primary key : playerID, yearID, stint
  • Foreign keys : playerID = Master$playerID (many-to-1)

Salaries¶

  • Primary key : yearID, teamID, playerID
  • Foreign keys : playerID = Master$playerID (many-to-1)

In [72]:
library(datamodelr)
dm1 <- dm_from_data_frames(list(
  Batting = Lahman::Batting,
  Master = Lahman::Master,
  Salaries = Lahman::Salaries
)) %>%
  dm_set_key("Master", "playerID") %>%
  dm_set_key("Batting", c("playerID", "yearID", "stint")) %>%
  dm_set_key("Salaries", c("yearID", "teamID", "playerID")) %>%
  dm_add_references(
    Batting$playerID == Master$playerID,
    Salaries$playerID == Master$playerID
  )

dm_create_graph(dm1, rankdir = "LR", columnArrows = TRUE) %>%
  dm_render_graph()
In [73]:
head(Master) ; head(Managers) ; head(AwardsManagers)
A data.frame: 6 × 26
playerIDbirthYearbirthMonthbirthDaybirthCountrybirthStatebirthCitydeathYeardeathMonthdeathDay...weightheightbatsthrowsdebutfinalGameretroIDbbrefIDdeathDatebirthDate
<chr><int><int><int><chr><chr><chr><int><int><int>...<int><int><fct><fct><chr><chr><chr><chr><date><date>
1aardsda0119811227USA CO Denver NANANA...21575RR2004-04-062015-08-23aardd001aardsda01NA1981-12-27
2aaronha011934 2 5USA AL Mobile 2021 122...18072RR1954-04-131976-10-03aaroh101aaronha012021-01-221934-02-05
3aaronto011939 8 5USA AL Mobile 1984 816...19075RR1962-04-101971-09-26aarot101aaronto011984-08-161939-08-05
4aasedo01 1954 9 8USA CA Orange NANANA...19075RR1977-07-261990-10-03aased001aasedo01 NA1954-09-08
5abadan01 1972 825USA FL Palm Beach NANANA...18473LL2001-09-102006-04-13abada001abadan01 NA1972-08-25
6abadfe01 19851217D.R.La RomanaLa Romana NANANA...23574LL2010-07-282019-09-28abadf001abadfe01 NA1985-12-17
A data.frame: 6 × 10
playerIDyearIDteamIDlgIDinseasonGWLrankplyrMgr
<chr><int><fct><fct><int><int><int><int><int><fct>
1wrighha011871BS1NA13120103Y
2woodji01 1871CH1NA12819 92Y
3paborch011871CL1NA12910198Y
4lennobi011871FW1NA114 5 98Y
5deaneha011871FW1NA2 5 2 38Y
6fergubo011871NY2NA13316175Y
A data.frame: 6 × 6
playerIDawardIDyearIDlgIDtienotes
<chr><chr><int><fct><chr><lgl>
1larusto01BBWAA Manager of the Year1983ALNANA
2lasorto01BBWAA Manager of the Year1983NLNANA
3andersp01BBWAA Manager of the Year1984ALNANA
4freyji99 BBWAA Manager of the Year1984NLNANA
5coxbo01 BBWAA Manager of the Year1985ALNANA
6herzowh01BBWAA Manager of the Year1985NLNANA

Master¶

  • Primary key : playerID

Managers¶

  • Primary key : yearID, teamID, inseason
  • Foreign keys : playerID references Master$playerID (many-to-1)

AwardsManagers:¶

  • Primary key : playerID, awardID, yearID
  • Foreign keys : playerID references Master$playerID (many-to-1)

In [74]:
dm2 <- dm_from_data_frames(list(
  Master = Lahman::Master,
  Managers = Lahman::Managers,
  AwardsManagers = Lahman::AwardsManagers
)) %>%
  dm_set_key("Master", "playerID") %>%
  dm_set_key("Managers", c("yearID", "teamID", "inseason")) %>%
  dm_set_key("AwardsManagers", c("playerID", "awardID", "yearID")) %>%
  dm_add_references(
    Managers$playerID == Master$playerID,
    AwardsManagers$playerID == Master$playerID
  )

dm_create_graph(dm2, rankdir = "LR", columnArrows = TRUE) %>%
  dm_render_graph()
In [75]:
head(Batting) ; head(Fielding) ; head(Pitching)
A data.frame: 6 × 22
playerIDyearIDstintteamIDlgIDGABRHX2B...RBISBCSBBSOIBBHBPSHSFGIDP
<chr><int><int><fct><fct><int><int><int><int><int>...<int><int><int><int><int><int><int><int><int><int>
1abercda0118711TRONA 1 4 0 0 0... 00000NANANANA0
2addybo01 18711RC1NA251183032 6...138140NANANANA0
3allisar0118711CL1NA291372840 4...193125NANANANA1
4allisdo0118711WS3NA27133284410...271102NANANANA0
5ansonca0118711RC1NA25120293911...166221NANANANA0
6armstbo0118711FW1NA12 49 911 2... 50101NANANANA0
A data.frame: 6 × 18
playerIDyearIDstintteamIDlgIDPOSGGSInnOutsPOAEDPPBWPSBCSZR
<chr><int><int><fct><fct><chr><int><int><int><int><int><int><int><int><int><int><int><int>
1abercda0118711TRONASS 1 1 24 1 3 20NANANANANA
2addybo01 18711RC1NA2B22226066772425NANANANANA
3addybo01 18711RC1NASS 3 3 96 814 70NANANANANA
4allisar0118711CL1NA2B 2 0 18 1 4 00NANANANANA
5allisar0118711CL1NAOF292972951 3 71NANANANANA
6allisdo0118711WS3NAC 2727681681520418NA 0 0NA
A data.frame: 6 × 30
playerIDyearIDstintteamIDlgIDWLGGSCG...IBBWPHBPBKBFPGFRSHSFGIDP
<chr><int><int><fct><fct><int><int><int><int><int>...<int><int><int><int><int><int><int><int><int><int>
1bechtge0118711PH1NA 1 2 3 3 2...NA 7NA0 1460 42NANANA
2brainas0118711WS3NA1215303030...NA 7NA012910292NANANA
3fergubo0118711NY2NA 0 0 1 0 0...NA 2NA0 140 9NANANA
4fishech0118711RC1NA 416242422...NA20NA010801257NANANA
5fleetfr0118711NY2NA 0 1 1 1 1...NA 0NA0 570 21NANANA
6flowedi0118711TRONA 0 0 1 0 0...NA 0NA0 31 0NANANA

Mutating joins¶

In [76]:
flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>% head()
head(airlines)
A tibble: 6 × 8
yearmonthdayhourorigindesttailnumcarrier
<int><int><int><dbl><chr><chr><chr><chr>
2013115EWRIAHN14228UA
2013115LGAIAHN24211UA
2013115JFKMIAN619AAAA
2013115JFKBQNN804JBB6
2013116LGAATLN668DNDL
2013115EWRORDN39463UA
A tibble: 6 × 2
carriername
<chr><chr>
9EEndeavor Air Inc.
AAAmerican Airlines Inc.
ASAlaska Airlines Inc.
B6JetBlue Airways
DLDelta Air Lines Inc.
EVExpressJet Airlines Inc.
In [20]:
flights2 %>% select(-origin, -dest) %>% left_join(airlines, by = "carrier") %>% head()
A tibble: 6 × 7
yearmonthdayhourtailnumcarriername
<int><int><int><dbl><chr><chr><chr>
2013115N14228UAUnited Air Lines Inc.
2013115N24211UAUnited Air Lines Inc.
2013115N619AAAAAmerican Airlines Inc.
2013115N804JBB6JetBlue Airways
2013116N668DNDLDelta Air Lines Inc.
2013115N39463UAUnited Air Lines Inc.
In [21]:
x <- tribble(
    ~key, ~val_x,
    1, "x1",
    2, "x2",
    3, "x3"
)
y <- tribble(
    ~key, ~val_y,
    1, "y1",
    2, "y2",
    4, "y3"
)
x ; y
A tibble: 3 × 2
keyval_x
<dbl><chr>
1x1
2x2
3x3
A tibble: 3 × 2
keyval_y
<dbl><chr>
1y1
2y2
4y3

내부 조인 (inner join)¶

In [22]:
x %>% inner_join(y, by = "key")
A tibble: 2 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2

외부 조인 (outer join)¶

  • left_join() : 좌측의 모든 관측값을 보존
  • right_join() : 우측의 모든 관측값을 보존
  • full_join() : 좌측과 우측의 모든 관측값을 보존
  • 존재하지 않는 값은 NA로 대체

중복키¶

경우 1¶

  • 하나의 테이블에 중복키가 있다. 중복키는 추가적인 정보를 넣을 때 유용한 데 일반적으로 일대다 관계가 있기 떄문이다.
In [39]:
x <- tribble(
    ~key, ~val_x,
    1, "x1",
    2, "x2", 
    2, "x3",
    1, "x4"
)
y <- tribble(
    ~key, ~val_y,
    1, "y1", 
    2, "y2"
)

left_join(x, y, by = "key")
A tibble: 4 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
2x3y2
1x4y1

경우 2¶

  • 두 테이블 모두 중복키가 있다. 키가 어느 테이블에서도 고유하게 관측값을 식별하지 않기 때문에 이것은 일반적 오류이다. 중복키로 조인하면 가능한 모든 조합인 데카르트곱을 얻음
In [40]:
x <- tribble(
    ~key, ~val_x,
    1, "x1",
    2, "x2", 
    2, "x3",
    3, "x4"
)
y <- tribble(
    ~key, ~val_y,
    1, "y1", 
    2, "y2",
    2, "y3",
    4, "y4"
)

left_join(x, y, by = "key")
A tibble: 6 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
2x2y3
2x3y2
2x3y3
3x4NA

자연 조인 (natural join)¶

  • by = NULL을 사용하면 두 테이블에 있는 모든 변수를 사용
In [77]:
flights2 %>% left_join(weather) %>% head()
Joining, by = c("year", "month", "day", "hour", "origin")
A tibble: 6 × 18
yearmonthdayhourorigindesttailnumcarriertempdewphumidwind_dirwind_speedwind_gustprecippressurevisibtime_hour
<int><int><int><dbl><chr><chr><chr><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dttm>
2013115EWRIAHN14228UA39.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
2013115LGAIAHN24211UA39.9224.9854.8125014.9601421.8648201011.4102013-01-01 05:00:00
2013115JFKMIAN619AAAA39.0226.9661.6326014.96014 NA01012.1102013-01-01 05:00:00
2013115JFKBQNN804JBB639.0226.9661.6326014.96014 NA01012.1102013-01-01 05:00:00
2013116LGAATLN668DNDL39.9224.9854.8126016.1109223.0156001011.7102013-01-01 06:00:00
2013115EWRORDN39463UA39.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
  • by = "x" : 이루 공통 변수만 사용
In [78]:
flights2 %>% left_join(planes, by = "tailnum") %>% head()
A tibble: 6 × 16
year.xmonthdayhourorigindesttailnumcarrieryear.ytypemanufacturermodelenginesseatsspeedengine
<int><int><int><dbl><chr><chr><chr><chr><int><chr><chr><chr><int><int><int><chr>
2013115EWRIAHN14228UA1999Fixed wing multi engineBOEING737-824 2149NATurbo-fan
2013115LGAIAHN24211UA1998Fixed wing multi engineBOEING737-824 2149NATurbo-fan
2013115JFKMIAN619AAAA1990Fixed wing multi engineBOEING757-223 2178NATurbo-fan
2013115JFKBQNN804JBB62012Fixed wing multi engineAIRBUSA320-232 2200NATurbo-fan
2013116LGAATLN668DNDL1991Fixed wing multi engineBOEING757-232 2178NATurbo-fan
2013115EWRORDN39463UA2012Fixed wing multi engineBOEING737-924ER2191NATurbo-fan
  • by = c("a" = "b") : 테이블 x의 변수 a와 테이블 y의 변수 b를 매칭
In [79]:
flights2 %>% left_join(airports, c("dest" = "faa")) %>% head()
A tibble: 6 × 15
yearmonthdayhourorigindesttailnumcarriernamelatlonalttzdsttzone
<int><int><int><dbl><chr><chr><chr><chr><chr><dbl><dbl><dbl><dbl><chr><chr>
2013115EWRIAHN14228UAGeorge Bush Intercontinental 29.98443-95.34144 97-6A America/Chicago
2013115LGAIAHN24211UAGeorge Bush Intercontinental 29.98443-95.34144 97-6A America/Chicago
2013115JFKMIAN619AAAAMiami Intl 25.79325-80.29056 8-5A America/New_York
2013115JFKBQNN804JBB6NA NA NA NANANANA
2013116LGAATLN668DNDLHartsfield Jackson Atlanta Intl33.63672-84.428071026-5A America/New_York
2013115EWRORDN39463UAChicago Ohare Intl 41.97860-87.90484 668-6A America/Chicago
In [80]:
flights2 %>% left_join(airports, c("origin" = "faa")) %>% head()
A tibble: 6 × 15
yearmonthdayhourorigindesttailnumcarriernamelatlonalttzdsttzone
<int><int><int><dbl><chr><chr><chr><chr><chr><dbl><dbl><dbl><dbl><chr><chr>
2013115EWRIAHN14228UANewark Liberty Intl40.69250-74.1686718-5AAmerica/New_York
2013115LGAIAHN24211UALa Guardia 40.77725-73.8726122-5AAmerica/New_York
2013115JFKMIAN619AAAAJohn F Kennedy Intl40.63975-73.7789313-5AAmerica/New_York
2013115JFKBQNN804JBB6John F Kennedy Intl40.63975-73.7789313-5AAmerica/New_York
2013116LGAATLN668DNDLLa Guardia 40.77725-73.8726122-5AAmerica/New_York
2013115EWRORDN39463UANewark Liberty Intl40.69250-74.1686718-5AAmerica/New_York
In [81]:
# 10.4.6
# 1
airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
In [82]:
avg_dest_delays <- flights %>% group_by(dest) %>% 
    summarise(delay = mean(arr_delay, na.rm = T)) %>% 
    inner_join(airports, by = c("dest" = "faa"))
In [83]:
avg_dest_delays %>% head
A tibble: 6 × 9
destdelaynamelatlonalttzdsttzone
<chr><dbl><chr><dbl><dbl><dbl><dbl><chr><chr>
ABQ 4.381890Albuquerque International Sunport35.04022-106.609195355-7AAmerica/Denver
ACK 4.852273Nantucket Mem 41.25305 -70.06018 48-5AAmerica/New_York
ALB14.397129Albany Intl 42.74827 -73.80169 285-5AAmerica/New_York
ANC-2.500000Ted Stevens Anchorage Intl 61.17436-149.99636 152-9AAmerica/Anchorage
ATL11.300113Hartsfield Jackson Atlanta Intl 33.63672 -84.428071026-5AAmerica/New_York
AUS 6.019909Austin Bergstrom Intl 30.19453 -97.66989 542-6AAmerica/Chicago
In [84]:
avg_dest_delays %>%
  ggplot(aes(lon, lat, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
In [85]:
# 2 
airport_locations <- airports %>%
  select(faa, lat, lon)

flights %>%
  select(year:day, hour, origin, dest) %>%
  left_join(
    airport_locations,
    by = c("origin" = "faa")
  ) %>%
  left_join(
    airport_locations,
    by = c("dest" = "faa")
  ) %>% print
# A tibble: 336,776 x 10
    year month   day  hour origin dest  lat.x lon.x lat.y lon.y
   <int> <int> <int> <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <dbl>
 1  2013     1     1     5 EWR    IAH    40.7 -74.2  30.0 -95.3
 2  2013     1     1     5 LGA    IAH    40.8 -73.9  30.0 -95.3
 3  2013     1     1     5 JFK    MIA    40.6 -73.8  25.8 -80.3
 4  2013     1     1     5 JFK    BQN    40.6 -73.8  NA    NA  
 5  2013     1     1     6 LGA    ATL    40.8 -73.9  33.6 -84.4
 6  2013     1     1     5 EWR    ORD    40.7 -74.2  42.0 -87.9
 7  2013     1     1     6 EWR    FLL    40.7 -74.2  26.1 -80.2
 8  2013     1     1     6 LGA    IAD    40.8 -73.9  38.9 -77.5
 9  2013     1     1     6 JFK    MCO    40.6 -73.8  28.4 -81.3
10  2013     1     1     6 LGA    ORD    40.8 -73.9  42.0 -87.9
# ... with 336,766 more rows
In [86]:
airport_locations <- airports %>%
  select(faa, lat, lon)

flights %>%
  select(year:day, hour, origin, dest) %>%
  left_join(
    airport_locations,
    by = c("origin" = "faa")
  ) %>%
  left_join(
    airport_locations,
    by = c("dest" = "faa"),
    suffix = c("_origin", "_dest")
  ) %>% print
# A tibble: 336,776 x 10
    year month   day  hour origin dest  lat_origin lon_origin lat_dest lon_dest
   <int> <int> <int> <dbl> <chr>  <chr>      <dbl>      <dbl>    <dbl>    <dbl>
 1  2013     1     1     5 EWR    IAH         40.7      -74.2     30.0    -95.3
 2  2013     1     1     5 LGA    IAH         40.8      -73.9     30.0    -95.3
 3  2013     1     1     5 JFK    MIA         40.6      -73.8     25.8    -80.3
 4  2013     1     1     5 JFK    BQN         40.6      -73.8     NA       NA  
 5  2013     1     1     6 LGA    ATL         40.8      -73.9     33.6    -84.4
 6  2013     1     1     5 EWR    ORD         40.7      -74.2     42.0    -87.9
 7  2013     1     1     6 EWR    FLL         40.7      -74.2     26.1    -80.2
 8  2013     1     1     6 LGA    IAD         40.8      -73.9     38.9    -77.5
 9  2013     1     1     6 JFK    MCO         40.6      -73.8     28.4    -81.3
10  2013     1     1     6 LGA    ORD         40.8      -73.9     42.0    -87.9
# ... with 336,766 more rows

여객기의 나이와 지연 시간 사이의 관계¶

In [87]:
# 3
plane_cohorts <- inner_join(flights,
  select(planes, tailnum, plane_year = year),
  by = "tailnum"
) %>%
  mutate(age = year - plane_year) %>%
  filter(!is.na(age)) %>%
  mutate(age = if_else(age > 25, 25L, age)) %>%
  group_by(age) %>%
  summarise(
    dep_delay_mean = mean(dep_delay, na.rm = TRUE),
    dep_delay_sd = sd(dep_delay, na.rm = TRUE),
    arr_delay_mean = mean(arr_delay, na.rm = TRUE),
    arr_delay_sd = sd(arr_delay, na.rm = TRUE),
    n_arr_delay = sum(!is.na(arr_delay)),
    n_dep_delay = sum(!is.na(dep_delay))
  )
plane_cohorts %>% print
# A tibble: 26 x 7
     age dep_delay_mean dep_delay_sd arr_delay_mean arr_delay_sd n_arr_delay
   <int>          <dbl>        <dbl>          <dbl>        <dbl>       <int>
 1     0          10.6          34.4           4.01         38.5        4611
 2     1           9.64         31.9           2.85         37.4        7196
 3     2          11.8          41.8           5.70         46.8        6008
 4     3          12.5          37.5           5.18         41.9        3771
 5     4          11.0          35.5           4.92         39.7        6572
 6     5          13.2          39.6           5.57         43.9       17731
 7     6          13.7          41.4           7.54         45.2       15142
 8     7          14.6          41.3           9.90         45.1       12998
 9     8          14.7          41.5           9.80         45.4       14064
10     9          16.4          44.2          10.2          48.0       15273
# ... with 16 more rows, and 1 more variable: n_dep_delay <int>
In [88]:
ggplot(plane_cohorts, aes(x = age, y = dep_delay_mean)) +
  geom_point() + 
    xlab("Age of Plane (years)") + ylab("Mean Departure Delay (minutes)")
In [89]:
ggplot(plane_cohorts, aes(x = age, y = arr_delay_mean)) +
  geom_point() + 
    xlab("Age of Plane (years)") + ylab("Mean Arrival Delay (minutes)")
In [90]:
# 4
flight_weather <- flights %>% 
    inner_join(weather, by = c("origin" = "origin",
                              "year" = "year",
                              "month" = "month",
                              "day" = "day",
                              "hour" = "hour"))
head(flight_weather)
A tibble: 6 × 29
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrier...tempdewphumidwind_dirwind_speedwind_gustprecippressurevisibtime_hour.y
<int><int><int><int><int><dbl><int><int><dbl><chr>...<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dttm>
201311517515 2 830 819 11UA...39.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
201311533529 4 850 830 20UA...39.9224.9854.8125014.9601421.8648201011.4102013-01-01 05:00:00
201311542540 2 923 850 33AA...39.0226.9661.6326014.96014 NA01012.1102013-01-01 05:00:00
201311544545-110041022-18B6...39.0226.9661.6326014.96014 NA01012.1102013-01-01 05:00:00
201311554600-6 812 837-25DL...39.9224.9854.8126016.1109223.0156001011.7102013-01-01 06:00:00
201311554558-4 740 728 12UA...39.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
In [91]:
flight_weather %>%
    group_by(precip) %>% 
    summarise(delay = mean(dep_delay, na.rm = T)) %>%
    ggplot(aes(precip, delay)) + geom_line() + geom_point()
In [92]:
flight_weather %>%
    ungroup() %>%
    mutate(visib_cat = cut_interval(visib, n = 10)) %>%
    group_by(visib_cat) %>%
    summarise(dep_delay = mean(dep_delay, na.rm = T)) %>%
    ggplot(aes(visib_cat, dep_delay)) + geom_point()
In [93]:
# 5
library(viridis)
flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap() + scale_colour_viridis()
Warning message:
"Removed 3 rows containing missing values (geom_point)."

merge() & SQL¶

inner_join(x, y) : merge(x, y)
left_join(x, y) : merge(x, y, all.x = T)
right_join(x, y) : merge(x, y, all.y = T)
full_join(x, y) : merge(x, y, all.x = T, all.y = T)
inner_join(x, y, by = "z") : SELECT * FROM x INNER JOIN y USING (z)
left_join(x, y, by = "z") : SELECT * FROM x LEFT OUTER JOIN y USING (z)
right_join(x, y, by = "z") : SELECT * FROM x RIGHT OUTER JOIN y USING (z)
full_join(x, y, by = "z") : SELECT * FROM x FULL OUTER JOIN JOIN y USING (z)

필터링 조인¶

  • semi_join(x, y) : y와 매치되는 x의 모든 관측값을 보존 (X값만 출력)
  • anti_join(x, y) : y와 매치되는 x의 모든 관측값을 삭제 (X값만 출력)
In [94]:
top_dest <- flights %>%
    count(dest, sort = T) %>% head(10)
In [95]:
flights %>% semi_join(top_dest) %>% print
Joining, by = "dest"
# A tibble: 141,145 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      542            540         2      923            850
 2  2013     1     1      554            600        -6      812            837
 3  2013     1     1      554            558        -4      740            728
 4  2013     1     1      555            600        -5      913            854
 5  2013     1     1      557            600        -3      838            846
 6  2013     1     1      558            600        -2      753            745
 7  2013     1     1      558            600        -2      924            917
 8  2013     1     1      558            600        -2      923            937
 9  2013     1     1      559            559         0      702            706
10  2013     1     1      600            600         0      851            858
# ... with 141,135 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
In [96]:
flights %>% anti_join(planes, by = "tailnum") %>% count(tailnum, sort = T) %>% head
A tibble: 6 × 2
tailnumn
<chr><int>
NA 2512
N725MQ 575
N722MQ 513
N723MQ 507
N713MQ 483
N735MQ 396
In [99]:
# 10.5.1
# 2
planes_gte100 <- flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  count() %>%
  filter(n >= 100) %>% print()
# A tibble: 1,217 x 2
# Groups:   tailnum [1,217]
   tailnum     n
   <chr>   <int>
 1 N0EGMQ    371
 2 N10156    153
 3 N10575    289
 4 N11106    129
 5 N11107    148
 6 N11109    148
 7 N11113    138
 8 N11119    148
 9 N11121    154
10 N11127    124
# ... with 1,207 more rows
In [109]:
#3
fueleconomy::vehicles %>% print
fueleconomy::common %>% print
# A tibble: 33,442 x 12
      id make  model        year class trans drive   cyl displ fuel    hwy   cty
   <dbl> <chr> <chr>       <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
 1 13309 Acura 2.2CL/3.0CL  1997 Subc~ Auto~ Fron~     4   2.2 Regu~    26    20
 2 13310 Acura 2.2CL/3.0CL  1997 Subc~ Manu~ Fron~     4   2.2 Regu~    28    22
 3 13311 Acura 2.2CL/3.0CL  1997 Subc~ Auto~ Fron~     6   3   Regu~    26    18
 4 14038 Acura 2.3CL/3.0CL  1998 Subc~ Auto~ Fron~     4   2.3 Regu~    27    19
 5 14039 Acura 2.3CL/3.0CL  1998 Subc~ Manu~ Fron~     4   2.3 Regu~    29    21
 6 14040 Acura 2.3CL/3.0CL  1998 Subc~ Auto~ Fron~     6   3   Regu~    26    17
 7 14834 Acura 2.3CL/3.0CL  1999 Subc~ Auto~ Fron~     4   2.3 Regu~    27    20
 8 14835 Acura 2.3CL/3.0CL  1999 Subc~ Manu~ Fron~     4   2.3 Regu~    29    21
 9 14836 Acura 2.3CL/3.0CL  1999 Subc~ Auto~ Fron~     6   3   Regu~    26    17
10 11789 Acura 2.5TL        1995 Comp~ Auto~ Fron~     5   2.5 Prem~    23    18
# ... with 33,432 more rows
# A tibble: 347 x 4
   make  model                n years
   <chr> <chr>            <int> <int>
 1 Acura Integra             42    16
 2 Acura Legend              28    10
 3 Acura MDX 4WD             12    12
 4 Acura NSX                 28    14
 5 Acura TSX                 27    11
 6 Audi  A4                  49    19
 7 Audi  A4 Avant quattro    49    15
 8 Audi  A4 quattro          66    19
 9 Audi  A6                  20    19
10 Audi  A6 Avant quattro    12    12
# ... with 337 more rows
In [111]:
fueleconomy::vehicles %>% 
semi_join(fueleconomy::common, c("make", "model")) %>% 
print
# A tibble: 14,531 x 12
      id make  model    year class     trans drive   cyl displ fuel    hwy   cty
   <dbl> <chr> <chr>   <dbl> <chr>     <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
 1  1833 Acura Integra  1986 Subcompa~ Auto~ Fron~     4   1.6 Regu~    28    22
 2  1834 Acura Integra  1986 Subcompa~ Manu~ Fron~     4   1.6 Regu~    28    23
 3  3037 Acura Integra  1987 Subcompa~ Auto~ Fron~     4   1.6 Regu~    28    22
 4  3038 Acura Integra  1987 Subcompa~ Manu~ Fron~     4   1.6 Regu~    28    23
 5  4183 Acura Integra  1988 Subcompa~ Auto~ Fron~     4   1.6 Regu~    27    22
 6  4184 Acura Integra  1988 Subcompa~ Manu~ Fron~     4   1.6 Regu~    28    23
 7  5303 Acura Integra  1989 Subcompa~ Auto~ Fron~     4   1.6 Regu~    27    22
 8  5304 Acura Integra  1989 Subcompa~ Manu~ Fron~     4   1.6 Regu~    28    23
 9  6442 Acura Integra  1990 Subcompa~ Auto~ Fron~     4   1.8 Regu~    24    20
10  6443 Acura Integra  1990 Subcompa~ Manu~ Fron~     4   1.8 Regu~    26    21
# ... with 14,521 more rows
In [123]:
# 4
worst_hours <- flights %>%
  mutate(hour = sched_dep_time %/% 100) %>%
  group_by(origin, year, month, day, hour) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(dep_delay)) %>%
  slice(1:48)
worst_hours %>% print
`summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can override using the `.groups` argument.
# A tibble: 48 x 6
   origin  year month   day  hour dep_delay
   <chr>  <int> <int> <int> <dbl>     <dbl>
 1 LGA     2013     7    28    21      280.
 2 EWR     2013     2     9    10      269 
 3 EWR     2013     2     9     9      266 
 4 LGA     2013     9     2    16      250.
 5 LGA     2013     7    22    18      246.
 6 LGA     2013     7    28    19      240.
 7 JFK     2013     4    10    21      237 
 8 LGA     2013     9    12    20      226.
 9 EWR     2013     3     8    12      225.
10 LGA     2013    12     5    11      221.
# ... with 38 more rows
In [126]:
weather_most_delayed <- semi_join(weather, worst_hours, 
                                  by = c("origin", "year",
                                         "month", "day", "hour")) %>% print
# A tibble: 48 x 15
   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
 1 EWR     2013     2     9     9  27.0  17.1  65.8      310      13.8 
 2 EWR     2013     2     9    10  28.0  16.0  60.1      310      19.6 
 3 EWR     2013     2     9    11  28.9  16.0  57.9      320      28.8 
 4 EWR     2013     3     8    12  33.8  32    95.8      320       9.21
 5 EWR     2013     3     8    14  34.0  33.1  96.5      320       8.06
 6 EWR     2013     6    30    16  80.1  73.0  79.2      160       8.06
 7 EWR     2013     7    10    19  86    69.1  57.1      250      13.8 
 8 EWR     2013     7    28    19  73.4  71.6  94.1       NA       6.90
 9 EWR     2013     9    12    17  84.0  73.0  69.6      180       5.75
10 EWR     2013     9    12    19  78.8  71.6  93.5      260      18.4 
# ... with 38 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
#   pressure <dbl>, visib <dbl>, time_hour <dttm>
In [128]:
select(weather_most_delayed, temp, wind_speed, precip) %>%
  print
# A tibble: 48 x 3
    temp wind_speed precip
   <dbl>      <dbl>  <dbl>
 1  27.0      13.8    0   
 2  28.0      19.6    0   
 3  28.9      28.8    0   
 4  33.8       9.21   0.06
 5  34.0       8.06   0.05
 6  80.1       8.06   0   
 7  86        13.8    0   
 8  73.4       6.90   0.08
 9  84.0       5.75   0   
10  78.8      18.4    0.23
# ... with 38 more rows
In [129]:
ggplot(weather_most_delayed, aes(x = precip, y = wind_speed, color = temp)) +
  geom_point()
In [135]:
# 5
anti_join(flights, airports, by = c("dest" = "faa")) %>% distinct(dest)
A tibble: 4 × 1
dest
<chr>
BQN
SJU
STT
PSE
In [139]:
anti_join(airports, flights, by = c("faa" = "dest")) %>% print
# A tibble: 1,357 x 8
   faa   name                             lat    lon   alt    tz dst   tzone    
   <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>    
 1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/~
 2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/~
 3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/~
 4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/~
 5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/~
 6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/~
 7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/~
 8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/~
 9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/~
10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/~
# ... with 1,347 more rows
In [145]:
# 6
planes_carriers <-
  flights %>%
  filter(!is.na(tailnum)) %>%
  distinct(tailnum, carrier) %>% print
# A tibble: 4,060 x 2
   carrier tailnum
   <chr>   <chr>  
 1 UA      N14228 
 2 UA      N24211 
 3 AA      N619AA 
 4 B6      N804JB 
 5 DL      N668DN 
 6 UA      N39463 
 7 B6      N516JB 
 8 EV      N829AS 
 9 B6      N593JB 
10 AA      N3ALAA 
# ... with 4,050 more rows
In [144]:
planes_carriers %>%
  count(tailnum) %>%
  filter(n > 1) %>% print
# A tibble: 17 x 2
   tailnum     n
   <chr>   <int>
 1 N146PQ      2
 2 N153PQ      2
 3 N176PQ      2
 4 N181PQ      2
 5 N197PQ      2
 6 N200PQ      2
 7 N228PQ      2
 8 N232PQ      2
 9 N933AT      2
10 N935AT      2
11 N977AT      2
12 N978AT      2
13 N979AT      2
14 N981AT      2
15 N989AT      2
16 N990AT      2
17 N994AT      2
In [142]:
carrier_transfer_tbl <- planes_carriers %>%
  group_by(tailnum) %>%
  filter(n() > 1) %>%
  left_join(airlines, by = "carrier") %>%
  arrange(tailnum, carrier)
In [143]:
carrier_transfer_tbl %>% print
# A tibble: 34 x 3
# Groups:   tailnum [17]
   carrier tailnum name                    
   <chr>   <chr>   <chr>                   
 1 9E      N146PQ  Endeavor Air Inc.       
 2 EV      N146PQ  ExpressJet Airlines Inc.
 3 9E      N153PQ  Endeavor Air Inc.       
 4 EV      N153PQ  ExpressJet Airlines Inc.
 5 9E      N176PQ  Endeavor Air Inc.       
 6 EV      N176PQ  ExpressJet Airlines Inc.
 7 9E      N181PQ  Endeavor Air Inc.       
 8 EV      N181PQ  ExpressJet Airlines Inc.
 9 9E      N197PQ  Endeavor Air Inc.       
10 EV      N197PQ  ExpressJet Airlines Inc.
# ... with 24 more rows

조인 문제¶

  • 조인을 원활하게 하기 위해서 주어진 데이터에 수행해야 하는 몇가지 작업
  1. 각 테이블에서 기본키를 구성하는 벼수 식별
  2. 기본키의 변수들에 결측값 확인
  3. 외래키가 다른 테이블의 기본키와 매칭되는 지 확인 (anti_join)

집합 연산¶

intersect(x, y) : 교집합
union(x, y) : 합집합
setdiff(x, y) : 차집합
In [146]:
df1 <- tribble(
    ~x, ~y,
    1, 1,
    2, 1
)
df2 <- tribble(
    ~x, ~y,
    1, 1,
    1, 2
)
In [147]:
intersect(df1, df2)
A tibble: 1 × 2
xy
<dbl><dbl>
11
In [148]:
union(df1, df2)
A tibble: 3 × 2
xy
<dbl><dbl>
11
21
12
In [151]:
setdiff(df1, df2) ; setdiff(df2, df1)
A tibble: 1 × 2
xy
<dbl><dbl>
21
A tibble: 1 × 2
xy
<dbl><dbl>
12